getwd()
## [1] "/Users/rtpw/Dropbox/01_GITHUBREPO/ELT-2-ChIP-revision/Rob/03_emb_L1_L3_intestine_RNAseq/02_scripts"
# import counts
countsData <- read.delim(file = "../01_input/all.counts", sep = " ")
# preview counts
head(countsData)
## chr start stop strand length embryo_cells_rep1
## WBGene00014450 MtDNA 1 55 + 55 0
## WBGene00014451 MtDNA 58 111 + 54 0
## WBGene00010957 MtDNA 113 549 + 437 0
## WBGene00010958 MtDNA 549 783 + 235 0
## WBGene00014452 MtDNA 785 840 + 56 0
## WBGene00014453 MtDNA 842 896 + 55 0
## embryo_cells_rep2 embryo_GFPminus_rep1 embryo_GFPminus_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryo_GFPminus_rep3 embryo_GFPplus_rep1 embryo_GFPplus_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryo_GFPplus_rep3 embryo_whole_rep2 embryo_whole_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## L1_cells_rep1 L1_cells_rep2 L1_cells_rep3 L1_GFPminus_rep1
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L1_GFPminus_rep2 L1_GFPminus_rep3 L1_GFPplus_rep1
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## L1_GFPplus_rep2 L1_GFPplus_rep3 L1_whole_rep1 L1_whole_rep2
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L1_whole_rep3 L3_cells_rep1 L3_cells_rep2 L3_cells_rep3
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_GFPminus_rep1 L3_GFPplus_rep2 L3_GFPminus_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## L3_GFPplus_rep1 L3_GFPminus_rep2 L3_GFPplus_rep3 L3_whole_rep1
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_whole_rep2 L3_whole_rep3
## WBGene00014450 0 0
## WBGene00014451 0 0
## WBGene00010957 0 0
## WBGene00010958 0 0
## WBGene00014452 0 0
## WBGene00014453 0 0
# print samples
colnames(countsData[6:ncol(countsData)])
## [1] "embryo_cells_rep1" "embryo_cells_rep2" "embryo_GFPminus_rep1"
## [4] "embryo_GFPminus_rep2" "embryo_GFPminus_rep3" "embryo_GFPplus_rep1"
## [7] "embryo_GFPplus_rep2" "embryo_GFPplus_rep3" "embryo_whole_rep2"
## [10] "embryo_whole_rep3" "L1_cells_rep1" "L1_cells_rep2"
## [13] "L1_cells_rep3" "L1_GFPminus_rep1" "L1_GFPminus_rep2"
## [16] "L1_GFPminus_rep3" "L1_GFPplus_rep1" "L1_GFPplus_rep2"
## [19] "L1_GFPplus_rep3" "L1_whole_rep1" "L1_whole_rep2"
## [22] "L1_whole_rep3" "L3_cells_rep1" "L3_cells_rep2"
## [25] "L3_cells_rep3" "L3_GFPminus_rep1" "L3_GFPplus_rep2"
## [28] "L3_GFPminus_rep3" "L3_GFPplus_rep1" "L3_GFPminus_rep2"
## [31] "L3_GFPplus_rep3" "L3_whole_rep1" "L3_whole_rep2"
## [34] "L3_whole_rep3"
# import metadata and process file
metadata1 <- read.table(file = "../01_input/RWP27_metadata.tsv", header = FALSE, stringsAsFactors = FALSE) %>% bind_rows(read.table(file = "../01_input/RWP26_metadata.tsv", header = FALSE, stringsAsFactors = FALSE)) %>%
bind_rows(read.table(file = "../01_input/RWP30_metadata.tsv", header = FALSE, stringsAsFactors = FALSE))
colnames(metadata1) <- c("Filename.Fwd", "Filename.Rev", "names")
head(metadata1)
## Filename.Fwd Filename.Rev names
## 1 RW57_S10_L003_R1_001 RW57_S10_L003_R2_001 embryo_cells_rep1
## 2 RW58_S11_L003_R1_001 RW58_S11_L003_R2_001 embryo_GFPplus_rep1
## 3 RW59_S12_L003_R1_001 RW59_S12_L003_R2_001 embryo_GFPminus_rep1
## 4 RW60_S13_L003_R1_001 RW60_S13_L003_R2_001 embryo_whole_rep2
## 5 RW61_S14_L003_R1_001 RW61_S14_L003_R2_001 embryo_cells_rep2
## 6 RW62_S15_L003_R1_001 RW62_S15_L003_R2_001 embryo_GFPplus_rep2
# separate and process sample info
metadata1 <- metadata1 %>% separate(names, sep = "_", into = c("stage", "sample", "rep"), remove = FALSE)
metadata1 <- metadata1 %>% mutate(stage = fct_relevel(stage, c("embryo", "L1", "L3")),
sample = fct_relevel(sample, c("whole", "cells", "GFPplus", "GFPminus")),
rep = fct_relevel(rep, c("rep1", "rep2", "rep3")),
names = fct_relevel(names, metadata1$names)
)
# Order columns according to metadata1 order
countsData <- countsData %>% select(chr:length, sort(metadata1$names))
head(countsData)
## chr start stop strand length embryo_cells_rep1
## WBGene00014450 MtDNA 1 55 + 55 0
## WBGene00014451 MtDNA 58 111 + 54 0
## WBGene00010957 MtDNA 113 549 + 437 0
## WBGene00010958 MtDNA 549 783 + 235 0
## WBGene00014452 MtDNA 785 840 + 56 0
## WBGene00014453 MtDNA 842 896 + 55 0
## embryo_GFPplus_rep1 embryo_GFPminus_rep1 embryo_whole_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryo_cells_rep2 embryo_GFPplus_rep2 embryo_GFPminus_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryo_whole_rep3 embryo_GFPplus_rep3 embryo_GFPminus_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## L1_whole_rep1 L1_cells_rep1 L1_GFPplus_rep1 L1_GFPminus_rep1
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L1_whole_rep2 L1_cells_rep2 L1_GFPplus_rep2 L1_GFPminus_rep2
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L1_whole_rep3 L1_cells_rep3 L1_GFPplus_rep3 L1_GFPminus_rep3
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_whole_rep1 L3_cells_rep1 L3_GFPplus_rep1 L3_GFPminus_rep1
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_whole_rep2 L3_cells_rep2 L3_GFPminus_rep2 L3_GFPplus_rep2
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_whole_rep3 L3_cells_rep3 L3_GFPplus_rep3 L3_GFPminus_rep3
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
# Generate a table called "cts" out of the countsData table. Subset the countsData.
cts <- as.matrix(countsData %>% select(metadata1$names))
head(cts)
## embryo_cells_rep1 embryo_GFPplus_rep1 embryo_GFPminus_rep1
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryo_whole_rep2 embryo_cells_rep2 embryo_GFPplus_rep2
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryo_GFPminus_rep2 embryo_whole_rep3 embryo_GFPplus_rep3
## WBGene00014450 0 0 0
## WBGene00014451 0 0 0
## WBGene00010957 0 0 0
## WBGene00010958 0 0 0
## WBGene00014452 0 0 0
## WBGene00014453 0 0 0
## embryo_GFPminus_rep3 L1_whole_rep1 L1_cells_rep1 L1_GFPplus_rep1
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L1_GFPminus_rep1 L1_whole_rep2 L1_cells_rep2 L1_GFPplus_rep2
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L1_GFPminus_rep2 L1_whole_rep3 L1_cells_rep3 L1_GFPplus_rep3
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L1_GFPminus_rep3 L3_whole_rep1 L3_cells_rep1 L3_GFPplus_rep1
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_GFPminus_rep1 L3_whole_rep2 L3_cells_rep2 L3_GFPminus_rep2
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_GFPplus_rep2 L3_whole_rep3 L3_cells_rep3 L3_GFPplus_rep3
## WBGene00014450 0 0 0 0
## WBGene00014451 0 0 0 0
## WBGene00010957 0 0 0 0
## WBGene00010958 0 0 0 0
## WBGene00014452 0 0 0 0
## WBGene00014453 0 0 0 0
## L3_GFPminus_rep3
## WBGene00014450 0
## WBGene00014451 0
## WBGene00010957 0
## WBGene00010958 0
## WBGene00014452 0
## WBGene00014453 0
# Reorganize the metadata table so the names2 column are now headers
rownames(metadata1)<- metadata1$names
coldata <- metadata1[,c("names", "stage", "sample", "rep")]
rownames(coldata) <- as.vector(metadata1$names)
# make grouping variable
coldata$group <- factor(paste0(coldata$stage, coldata$sample))
coldata
## names stage sample rep group
## embryo_cells_rep1 embryo_cells_rep1 embryo cells rep1 embryocells
## embryo_GFPplus_rep1 embryo_GFPplus_rep1 embryo GFPplus rep1 embryoGFPplus
## embryo_GFPminus_rep1 embryo_GFPminus_rep1 embryo GFPminus rep1 embryoGFPminus
## embryo_whole_rep2 embryo_whole_rep2 embryo whole rep2 embryowhole
## embryo_cells_rep2 embryo_cells_rep2 embryo cells rep2 embryocells
## embryo_GFPplus_rep2 embryo_GFPplus_rep2 embryo GFPplus rep2 embryoGFPplus
## embryo_GFPminus_rep2 embryo_GFPminus_rep2 embryo GFPminus rep2 embryoGFPminus
## embryo_whole_rep3 embryo_whole_rep3 embryo whole rep3 embryowhole
## embryo_GFPplus_rep3 embryo_GFPplus_rep3 embryo GFPplus rep3 embryoGFPplus
## embryo_GFPminus_rep3 embryo_GFPminus_rep3 embryo GFPminus rep3 embryoGFPminus
## L1_whole_rep1 L1_whole_rep1 L1 whole rep1 L1whole
## L1_cells_rep1 L1_cells_rep1 L1 cells rep1 L1cells
## L1_GFPplus_rep1 L1_GFPplus_rep1 L1 GFPplus rep1 L1GFPplus
## L1_GFPminus_rep1 L1_GFPminus_rep1 L1 GFPminus rep1 L1GFPminus
## L1_whole_rep2 L1_whole_rep2 L1 whole rep2 L1whole
## L1_cells_rep2 L1_cells_rep2 L1 cells rep2 L1cells
## L1_GFPplus_rep2 L1_GFPplus_rep2 L1 GFPplus rep2 L1GFPplus
## L1_GFPminus_rep2 L1_GFPminus_rep2 L1 GFPminus rep2 L1GFPminus
## L1_whole_rep3 L1_whole_rep3 L1 whole rep3 L1whole
## L1_cells_rep3 L1_cells_rep3 L1 cells rep3 L1cells
## L1_GFPplus_rep3 L1_GFPplus_rep3 L1 GFPplus rep3 L1GFPplus
## L1_GFPminus_rep3 L1_GFPminus_rep3 L1 GFPminus rep3 L1GFPminus
## L3_whole_rep1 L3_whole_rep1 L3 whole rep1 L3whole
## L3_cells_rep1 L3_cells_rep1 L3 cells rep1 L3cells
## L3_GFPplus_rep1 L3_GFPplus_rep1 L3 GFPplus rep1 L3GFPplus
## L3_GFPminus_rep1 L3_GFPminus_rep1 L3 GFPminus rep1 L3GFPminus
## L3_whole_rep2 L3_whole_rep2 L3 whole rep2 L3whole
## L3_cells_rep2 L3_cells_rep2 L3 cells rep2 L3cells
## L3_GFPminus_rep2 L3_GFPminus_rep2 L3 GFPminus rep2 L3GFPminus
## L3_GFPplus_rep2 L3_GFPplus_rep2 L3 GFPplus rep2 L3GFPplus
## L3_whole_rep3 L3_whole_rep3 L3 whole rep3 L3whole
## L3_cells_rep3 L3_cells_rep3 L3 cells rep3 L3cells
## L3_GFPplus_rep3 L3_GFPplus_rep3 L3 GFPplus rep3 L3GFPplus
## L3_GFPminus_rep3 L3_GFPminus_rep3 L3 GFPminus rep3 L3GFPminus
# Check that the names match --> Should be TRUE
all(rownames(coldata) == colnames(cts))
## [1] TRUE
Generate the DESeqDataSet. The variables in this design formula will be the type of sample, and the preparation date. This should reduce the variability between the samples based on when they were made.
From the vignette: “In order to benefit from the default settings of the package, you should put the variable of interest at the end of the formula and make sure the control level is the first level.”
The variable of interest is the sample type.
Using DESeqDataSetFromMatrix since I used the program featureCounts.
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ group)
Visualize read count distribution
raw_count_threshold <- 10
hist(log10(rowSums(counts(dds))), breaks = 50)
abline(v = log10(raw_count_threshold), col = "red", lty = 2)
# Filter genes with sum counts per million >= 10 across all samples
cpm <- apply(counts(dds),2, function(x) (x/sum(x))*1000000)
hist(log10(rowSums(cpm)))
abline(v = log10(raw_count_threshold), col = "red", lty = 2)
Filter genes with low read counts
keep <- rowSums(cpm) >= raw_count_threshold
dds <- dds[keep,]
dds
## class: DESeqDataSet
## dim: 16762 34
## metadata(1): version
## assays(1): counts
## rownames(16762): WBGene00021406 WBGene00021407 ... WBGene00199694
## WBGene00044951
## rowData names(0):
## colnames(34): embryo_cells_rep1 embryo_GFPplus_rep1 ... L3_GFPplus_rep3
## L3_GFPminus_rep3
## colData names(5): names stage sample rep group
Perform Differential Expression
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept" "group_embryoGFPminus_vs_embryocells"
## [3] "group_embryoGFPplus_vs_embryocells" "group_embryowhole_vs_embryocells"
## [5] "group_L1cells_vs_embryocells" "group_L1GFPminus_vs_embryocells"
## [7] "group_L1GFPplus_vs_embryocells" "group_L1whole_vs_embryocells"
## [9] "group_L3cells_vs_embryocells" "group_L3GFPminus_vs_embryocells"
## [11] "group_L3GFPplus_vs_embryocells" "group_L3whole_vs_embryocells"
vsd <- vst(dds, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- vsd$names
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)
vsd.corr.per.stage <- function(x, main){
vsd <- assay(vsd)[,metadata1 %>% filter(grepl(x, names)) %>% pull(names)]
sampleDists <- dist(t(vsd))
sampleDistMatrix <- as.matrix(sampleDists)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors,
main = main)
}
vsd.corr.per.stage("embryo", "Embryo Stage Intestine FACS RNA-seq Correlation")
vsd.corr.per.stage("L1", "L1 Stage Intestine FACS RNA-seq Correlation")
vsd.corr.per.stage("L3", "L3 Stage Intestine FACS RNA-seq Correlation")
vsd.corr.per.stage("GFPplus|GFPminus", "Correlation of FACS isolated GFP+ and GFP- samples")
Per-stage GFP+ and GFP- correlation
vsd.corr.per.stage("embryo_GFPplus|embryo_GFPminus", "Correlation of Embryo FACS isolated GFP+ and GFP- samples")
vsd.corr.per.stage("L1_GFPplus|L1_GFPminus", "Correlation of L1 FACS isolated GFP+ and GFP- samples")
vsd.corr.per.stage("L3_GFPplus|L3_GFPminus", "Correlation of L3 FACS isolated GFP+ and GFP- samples")
# Remove L1 rep 2
remove_samples <- c("L1_whole_rep2", "L1_cells_rep2", "L1_GFPplus_rep2", "L1_GFPminus_rep2")
coldata <- coldata %>% filter(!names %in% remove_samples)
dds <- dds[,!colnames(dds)%in% remove_samples]
dds <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept" "group_embryoGFPminus_vs_embryocells"
## [3] "group_embryoGFPplus_vs_embryocells" "group_embryowhole_vs_embryocells"
## [5] "group_L1cells_vs_embryocells" "group_L1GFPminus_vs_embryocells"
## [7] "group_L1GFPplus_vs_embryocells" "group_L1whole_vs_embryocells"
## [9] "group_L3cells_vs_embryocells" "group_L3GFPminus_vs_embryocells"
## [11] "group_L3GFPplus_vs_embryocells" "group_L3whole_vs_embryocells"
shrunk_pairwise_array_df <- function(stage) {
samples <- paste(stage, c("whole", "GFPplus", "cells", "GFPminus"), sep = "")
combos <- combn(samples, 2, simplify = FALSE)
# print(combos)
all_pairwise_comparisons <- data.frame()
for (i in 1:length(combos)) {
tobind <-
as.data.frame(lfcShrink(
dds,
contrast = c("group", combos[[i]][1], combos[[i]][2]),
type = "ashr",
quiet = TRUE
)) %>%
rownames_to_column(var = "WBGeneID") %>%
mutate(comparison = paste(combos[[i]][1], combos[[i]][2], sep = "_vs_")) %>%
mutate(label = str_remove_all(comparison, "embryo|L1|L3"))
all_pairwise_comparisons <- bind_rows(all_pairwise_comparisons, tobind)
}
all_pairwise_comparisons
}
MA_plot_array <- function(in.df, title, sig){
ggplot(in.df %>% mutate(padj = replace_na(padj, 1)), aes(x = log10(baseMean), y = log2FoldChange, color = padj < sig)) +
geom_point(shape = 16, alpha = 0.1, stroke = 0, size = 1) +
ylim(c(-10,10))+
facet_wrap(~label) +
scale_color_manual(values = c("black", "red"), name = "q.value < 0.1") +
theme_classic() +
ggtitle(title)
}
alt_hyp_res_df <- function(stage, thresh, sig){
samples <- paste(stage, c("whole", "GFPplus", "cells", "GFPminus"), sep = "")
combos <- combn(samples, 2, simplify = FALSE)
hyps = c("greater", "less", "lessAbs")
df <- data.frame()
for(i in 1:length(combos)){
for(hyp in hyps){
thresh_res <- results(dds, contrast = c("group", combos[[i]][1],combos[[i]][2]), lfcThreshold=thresh, altHypothesis = hyp, alpha = sig)
tobind <- data.frame(as.data.frame(thresh_res),
type = hyp,
comparison = paste(combos[[i]][1], combos[[i]][2], sep = "_vs_")) %>%
mutate(label = str_remove_all(comparison, "embryo|L1|L3")) %>%
rownames_to_column(var = "WBGeneID")
# tobind<-data.frame(plotMA(thresh_res, returnData = TRUE),
# comparison = paste(combos[[i]][1], combos[[i]][2], sep = "_vs_"),
# type = hyp) %>% mutate(label = str_remove_all(comparison, "embryo|L1|L3"))
df <- rbind(df, tobind)
}
}
df <- df %>%
mutate(isDE = case_when(type == "greater" & log2FoldChange >= thresh & padj <= sig ~ TRUE,
type == "less" & log2FoldChange <= thresh & padj <= sig ~ TRUE,
type == "lessAbs" & (log2FoldChange < thresh | log2FoldChange > thresh) & padj <= sig ~ TRUE,
padj > sig ~ FALSE,
is.na(padj) ~ FALSE))
df
}
de_category_MA_plot <- function(df, title){
df %>% filter(isDE == TRUE) %>%
ggplot(aes(x = log10(baseMean), y = log2FoldChange, color = type)) +
geom_point(data =df %>% mutate(padj = replace_na(padj, 1)), shape = 16, alpha = 0.01, stroke = 0, size = 1, color = "grey") +
geom_point(shape = 16, alpha = 0.5, stroke = 0, size = 1) +
ylim(c(-10,10))+
facet_wrap(~label) +
# scale_color_manual(values = c("black", "red"), name = "q.value < 0.1") +
theme_classic() +
ggtitle(title)
}
options(dplyr.summarise.inform = FALSE)
de_category_bar_plot <- function(df, title){
df %>% filter(isDE == TRUE) %>% group_by(label, type) %>% summarize(genes = n()) %>%
ggplot(aes(x = type, y = genes, label = genes, fill = type)) +
geom_bar(stat = "identity") +
geom_text(vjust = -0.25) +
facet_wrap(~label) +
theme_classic() +
ggtitle(title)
}
thresh = 1
sig = 0.01
embryo_alt_hyp_res_df <- alt_hyp_res_df("embryo", thresh = thresh, sig = sig)
# embryo_alt_hyp_res_df %>% replace_na(list(padj = 1)) %>% pull(baseMean) %>% range
de_category_MA_plot(embryo_alt_hyp_res_df, paste("Embryo differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
## Warning: Removed 120 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
de_category_bar_plot(embryo_alt_hyp_res_df, paste("Embryo differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
L1_alt_hyp_res_df<- alt_hyp_res_df("L1", thresh = thresh, sig = sig)
de_category_MA_plot(L1_alt_hyp_res_df, paste("L1 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
## Warning: Removed 108 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
de_category_bar_plot(L1_alt_hyp_res_df, paste("L1 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
L3_alt_hyp_res_df<- alt_hyp_res_df("L3", thresh = thresh, sig = sig)
de_category_MA_plot(L3_alt_hyp_res_df, paste("L3 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
de_category_bar_plot(L3_alt_hyp_res_df, paste("L3 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
embryo_pairwise_res_shrunk <- shrunk_pairwise_array_df(stage = "embryo")
MA_plot_array(embryo_pairwise_res_shrunk, "embryo FACS ashr shrunk", sig = 0.01)
## Warning: Removed 12 rows containing missing values (geom_point).
Label the shrunken plots with expression status. doesn’t work, currently giving error: Error: vector memory exhausted (limit reached?)
embryo_pairwise_res_shrunk_DEtype <- left_join(embryo_pairwise_res_shrunk, embryo_alt_hyp_res_df %>% select(WBGeneID, type, label, isDE), by = c("WBGeneID","label"))
MA_plot_array
## function(in.df, title, sig){
## ggplot(in.df %>% mutate(padj = replace_na(padj, 1)), aes(x = log10(baseMean), y = log2FoldChange, color = padj < sig)) +
## geom_point(shape = 16, alpha = 0.1, stroke = 0, size = 1) +
## ylim(c(-10,10))+
## facet_wrap(~label) +
## scale_color_manual(values = c("black", "red"), name = "q.value < 0.1") +
## theme_classic() +
## ggtitle(title)
## }
ggplot(embryo_pairwise_res_shrunk_DEtype %>% filter(isDE == TRUE), aes(x = log10(baseMean), y = log2FoldChange, color = type)) +
geom_point(data = embryo_pairwise_res_shrunk_DEtype %>% mutate(padj = replace_na(padj, 1)), shape = 16, alpha = 0.01, stroke = 0, size = 1, color = "grey") +
geom_point(shape = 16, alpha = 0.1, stroke = 0, size = 1) +
ylim(c(-10,10))+
facet_wrap(~label) +
# scale_color_manual(values = c("black", "red"), name = "q.value < 0.1") +
theme_classic()
## Warning: Removed 36 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
table(embryo_pairwise_res_shrunk_DEtype$type, embryo_pairwise_res_shrunk_DEtype$isDE)
##
## FALSE TRUE
## greater 96273 4299
## less 99418 1154
## lessAbs 91796 8776
L1_pairwise_res_shrunk <- shrunk_pairwise_array_df(stage = "L1")
MA_plot_array(L1_pairwise_res_shrunk, "L1 FACS ashr shrunk", sig = 0.01)
## Warning: Removed 3 rows containing missing values (geom_point).
L3_pairwise_res_shrunk <- shrunk_pairwise_array_df(stage = "L3")
MA_plot_array(L3_pairwise_res_shrunk, "L3 FACS ashr shrunk", sig = 0.01)
res_embryoGFPplus_vs_embryoGFPminus <- results(dds, contrast = c("group", "embryoGFPplus", "embryoGFPminus"))
res_L1GFPplus_vs_L1_GFPminus <- results(dds, contrast = c("group", "L1GFPplus", "L1GFPminus"))
res_L3GFPplus_vs_L3_GFPminus <- results(dds, contrast = c("group", "L3GFPplus", "L3GFPminus"))
par(mfrow=c(1,3),mar=c(2,2,1,1))
ylim <- c(-15,15)
# drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
sig = 0.01
plotMA(res_embryoGFPplus_vs_embryoGFPminus, ylim=ylim, main = "Embryo GFP+ vs GFP-", alpha = sig)
plotMA(res_L1GFPplus_vs_L1_GFPminus, ylim=ylim, main = "L1 GFP+ vs GFP-", alpha = sig)
plotMA(res_L3GFPplus_vs_L3_GFPminus, ylim=ylim, main = "L3 GFP+ vs GFP-", alpha = sig)
res_embryoGFPplus_vs_embryoGFPminus_ashr <- lfcShrink(dds, contrast = c("group", "embryoGFPplus", "embryoGFPminus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res_L1GFPplus_vs_L1GFPminus_ashr <- lfcShrink(dds, contrast = c("group", "L1GFPplus", "L1GFPminus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res_L3GFPplus_vs_L3GFPminus_ashr <- lfcShrink(dds, contrast = c("group", "L3GFPplus", "L3GFPminus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
# write_csv(as.data.frame(res_embryoGFPplus_vs_embryoGFPminus_apeglm) %>% rownames_to_column(var = "WBGeneID"),
# file = "../03_output/res_embryoGFPplus_vs_embryoGFPminus_apeglm.csv")
#
# write_csv(as.data.frame(res_L1GFPplus_vs_L1GFPminus_apeglm) %>% rownames_to_column(var = "WBGeneID"),
# file = "../03_output/res_L1GFPplus_vs_L1GFPminus_apeglm.csv")
#
# write_csv(as.data.frame(res_L3GFPplus_vs_L3GFPminus_apeglm) %>% rownames_to_column(var = "WBGeneID"),
# file = "../03_output/res_L3GFPplus_vs_L3GFPminus_apeglm.csv")
par(mfrow=c(1,3),mar=c(2,2,1,1))
ylim <- c(-10,10)
# drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
sig = 0.01
plotMA(res_embryoGFPplus_vs_embryoGFPminus_ashr, ylim=ylim, main = "Embryo GFP+ vs GFP-", alpha = sig)
plotMA(res_L1GFPplus_vs_L1GFPminus_ashr, ylim=ylim, main = "L1 GFP+ vs GFP-", alpha = sig)
plotMA(res_L3GFPplus_vs_L3GFPminus_ashr, ylim=ylim, main = "L3 GFP+ vs GFP-", alpha = sig)
plotCounts(dds, "WBGene00001578", intgroup = "group", main = "ges-1 read counts")
plotCounts(dds, "WBGene00001578", intgroup = "group", returnData = TRUE) %>%
separate(group, sep = "(?<=embryo)|(?<=L1)|(?<=L3)", into = c("stage", "sample"), remove = FALSE) %>%
ggplot(aes(x = sample, y = count)) +
geom_boxplot() +
geom_point() +
facet_grid(~stage) +
ggtitle("ges-1 read counts") +
theme_classic()
plotCounts(dds, "WBGene00001250", intgroup = "group", main = "elt-2 read counts")
plotCounts(dds, "WBGene00001250", intgroup = "group", returnData = TRUE) %>%
separate(group, sep = "(?<=embryo)|(?<=L1)|(?<=L3)", into = c("stage", "sample"), remove = FALSE) %>%
ggplot(aes(x = sample, y = count)) +
geom_boxplot() +
geom_point() +
facet_grid(~stage) +
ggtitle("elt-2 read counts") +
theme_classic()
# Annotate and quantify tissue specific genes
tissue_specific_genes <- read_csv(file = "../../01_tissue_specific_genes/03_output/tissue_specific_genes_220202.csv", show_col_types = FALSE)
tissue_annotated_MA <- function(in_res, de_df){
df <- as.data.frame(in_res) %>% rownames_to_column(var = "WBGeneID") %>%
left_join(tissue_specific_genes, by = "WBGeneID") %>%
mutate(tissue = replace_na(tissue, "other")) %>%
left_join(de_df %>% filter(label == "GFPplus_vs_GFPminus") %>% select(WBGeneID, type, isDE), by = "WBGeneID")
df %>% filter(isDE == TRUE) %>%
ggplot(aes(x = log10(baseMean), y = log2FoldChange, color = type)) +
geom_point(data =df %>% select(-tissue), shape = 16, alpha = 0.1, stroke = 0, size = 1, color = "grey") +
geom_point(shape = 16, alpha = 0.5, stroke = 0, size = 1) +
facet_wrap(~tissue) +
ylim(c(-10,10)) +
theme_classic()
}
tissue_annotated_MA(res_embryoGFPplus_vs_embryoGFPminus_ashr, embryo_alt_hyp_res_df)
## Warning: Removed 240 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_point).
tissue_annotated_MA(res_L1GFPplus_vs_L1GFPminus_ashr, L1_alt_hyp_res_df)
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
tissue_annotated_MA(res_L3GFPplus_vs_L3GFPminus_ashr, L3_alt_hyp_res_df)
tissue_gene_quant <- function(in_df, sig = 0.01, thresh = 1){
my_plot <- in_df %>% filter(isDE == TRUE, label == "GFPplus_vs_GFPminus") %>%
left_join(tissue_specific_genes, by = "WBGeneID") %>%
mutate(tissue = replace_na(tissue, "other"), padj = replace_na(padj, 1)) %>%
group_by(tissue, type) %>%
summarise(genes = n()) %>%
ggplot(aes(x = type, y = genes, label = genes, fill = type)) +
geom_bar(stat = "identity") +
geom_text(vjust = -0.25) +
facet_wrap(~tissue)+
ggtitle(paste("comparison: ",deparse(substitute(in_df)), "\nlfc = ",thresh," & padj < ",sig, sep = "")) +
theme_classic()
my_plot
}
tissue_gene_quant(embryo_alt_hyp_res_df)
tissue_gene_quant(L1_alt_hyp_res_df)
tissue_gene_quant(L3_alt_hyp_res_df)
# Between-stage GFP+ comparisons
res_embryoGFPplus_vs_L1GFPplus <- results(dds, contrast = c("group", "embryoGFPplus", "L1GFPplus"))
res_embryoGFPplus_vs_L3GFPplus <- results(dds, contrast = c("group", "embryoGFPplus", "L3GFPplus"))
res_L3GFPplus_vs_L1GFPplus <- results(dds, contrast = c("group", "L1GFPplus", "L3GFPplus"))
res_embryoGFPplus_vs_L1GFPplus_ashr <- lfcShrink(dds, contrast = c("group", "embryoGFPplus", "L1GFPplus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res_embryoGFPplus_vs_L3GFPplus_ashr <- lfcShrink(dds, contrast = c("group", "embryoGFPplus", "L3GFPplus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res_L3GFPplus_vs_L1GFPplus_ashr <- lfcShrink(dds, contrast = c("group", "L3GFPplus", "L1GFPplus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3),mar=c(2,2,1,1))
ylim <- c(-15,15)
# drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
# sig = 0.01
plotMA(res_embryoGFPplus_vs_L1GFPplus, ylim = ylim, main = "embryo_vs_L1", alpha = 0.01)
plotMA(res_embryoGFPplus_vs_L3GFPplus, ylim = ylim, main = "embryo_vs_L3", alpha = 0.01)
plotMA(res_L3GFPplus_vs_L1GFPplus, ylim = ylim, main = "L3_vs_L1", alpha = 0.01)
par(mfrow=c(1,3),mar=c(2,2,1,1))
ylim <- c(-15,15)
# drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
# sig = 0.01
plotMA(res_embryoGFPplus_vs_L1GFPplus_ashr, ylim = ylim, main = "embryo_vs_L1", alpha = 0.01)
plotMA(res_embryoGFPplus_vs_L3GFPplus_ashr, ylim = ylim, main = "embryo_vs_L3", alpha = 0.01)
plotMA(res_L3GFPplus_vs_L1GFPplus_ashr, ylim = ylim, main = "L3_vs_L1", alpha = 0.01)
res_between_stage_GFP_df <- data.frame(as.data.frame(res_embryoGFPplus_vs_L1GFPplus_ashr), comparison = "embryo_vs_L1") %>%
bind_rows(data.frame(as.data.frame(res_embryoGFPplus_vs_L3GFPplus_ashr), comparison = "embryo_vs_L3")) %>%
bind_rows(data.frame(as.data.frame(res_L3GFPplus_vs_L1GFPplus_ashr), comparison = "L3_vs_L1")) %>% rownames_to_column(var = "WBGeneID")
res_between_stage_GFP_df %>% replace_na(list(padj = 1)) %>% filter(padj <= 0.01) %>%
# filter(log2FoldChange >= 1 | log2FoldChange <= -1 , padj <= 0.01) %>%
ggplot(aes(x = log10(baseMean), y = log2FoldChange)) +
geom_point(shape = 16, alpha = 0.1, stroke = 0, size = 1, color = "red") +
geom_point(data = res_between_stage_GFP_df %>% replace_na(list(padj = 1)) %>% filter(padj > 0.01),shape = 16, alpha = 0.1, stroke = 0, size = 1, color = "black") +
ylim(-10,10) +
facet_grid(~comparison) +
theme_classic()
## Warning: Removed 110 rows containing missing values (geom_point).
par(mfrow=c(3,3),mar=c(2,2,1,1))
ylim <- c(-15,15)
plotMA(results(dds, contrast = c("group", "embryoGFPplus", "L1GFPplus"), lfcThreshold=thresh, altHypothesis = "greater", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "embryoGFPplus", "L1GFPplus"), lfcThreshold=thresh, altHypothesis = "less", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "embryoGFPplus", "L1GFPplus"), lfcThreshold=thresh, altHypothesis = "lessAbs", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "embryoGFPplus", "L3GFPplus"), lfcThreshold=thresh, altHypothesis = "greater", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "embryoGFPplus", "L3GFPplus"), lfcThreshold=thresh, altHypothesis = "less", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "embryoGFPplus", "L3GFPplus"), lfcThreshold=thresh, altHypothesis = "lessAbs", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "L3GFPplus", "L1GFPplus"), lfcThreshold=thresh, altHypothesis = "greater", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "L3GFPplus", "L1GFPplus"), lfcThreshold=thresh, altHypothesis = "less", alpha = sig), ylim= ylim)
plotMA(results(dds, contrast = c("group", "L3GFPplus", "L1GFPplus"), lfcThreshold=thresh, altHypothesis = "lessAbs", alpha = sig), ylim= ylim)
# plotPCA(all_samples_rld, intgroup = "group")
Fix this to reflect new altHypothesis approach
all_samples_rld <- rlog(dds)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
# write_rds(all_samples_rld, file = "../03_output/all_samples_rlog_counts.rds")
all_samples_rld <- read_rds(file = "../03_output/all_samples_rlog_counts.rds")
all_samples_rld_df <- as.data.frame(assay(all_samples_rld)) %>% rownames_to_column(var = "WBGeneID")
head(all_samples_rld_df)
## WBGeneID embryo_cells_rep1 embryo_GFPplus_rep1 embryo_GFPminus_rep1
## 1 WBGene00021406 6.197871 6.1818642 6.6865312
## 2 WBGene00021407 1.849025 3.4500713 2.9081820
## 3 WBGene00021408 3.702000 3.4062677 0.7159172
## 4 WBGene00021404 1.055008 3.3744122 0.5814498
## 5 WBGene00001492 9.811413 10.0906548 9.8115389
## 6 WBGene00021403 1.541390 0.9198749 1.4125466
## embryo_whole_rep2 embryo_cells_rep2 embryo_GFPplus_rep2 embryo_GFPminus_rep2
## 1 5.519894 5.984528 7.182480 5.5494918
## 2 2.999011 2.288092 1.259217 1.3327583
## 3 1.194851 1.825601 3.555042 0.7887810
## 4 1.956233 1.349358 3.143460 0.6462158
## 5 8.960245 9.407138 9.360898 9.2001166
## 6 2.156149 3.630781 1.312916 1.5849015
## embryo_whole_rep3 embryo_GFPplus_rep3 embryo_GFPminus_rep3 L1_whole_rep1
## 1 6.342218 6.108270 5.2372098 6.237888
## 2 4.623845 2.328471 2.7607464 4.961974
## 3 1.259849 3.160220 0.7272470 1.728401
## 4 1.112739 3.117216 0.5915073 1.050472
## 5 9.546343 9.603875 8.9331675 9.190692
## 6 2.142062 1.264118 1.3239725 2.441694
## L1_cells_rep1 L1_GFPplus_rep1 L1_GFPminus_rep1 L1_whole_rep3 L1_cells_rep3
## 1 6.155500 7.1258340 5.3815004 8.695192 5.1911395
## 2 2.131619 1.2923964 3.1043893 3.439540 1.3916658
## 3 2.114088 3.8613046 0.7513098 2.494565 2.4087845
## 4 2.179549 3.2909460 0.9764870 2.438588 2.0567409
## 5 8.626417 9.8961711 8.3745063 9.296279 8.7424167
## 6 1.678511 0.9065791 2.4607359 2.876776 0.9682353
## L1_GFPplus_rep3 L1_GFPminus_rep3 L3_whole_rep1 L3_cells_rep1 L3_GFPplus_rep1
## 1 5.804697 5.429372 5.2835331 6.0899893 5.296188
## 2 1.384288 2.065005 3.8123905 2.6374819 2.497470
## 3 2.375504 1.565808 0.8219569 0.7761215 2.964521
## 4 1.831484 1.403129 1.2062139 0.6349513 1.677340
## 5 9.427606 9.101716 8.7563080 8.7258012 8.935722
## 6 0.963559 1.004547 2.9233297 2.1432108 1.628513
## L3_GFPminus_rep1 L3_whole_rep2 L3_cells_rep2 L3_GFPminus_rep2 L3_GFPplus_rep2
## 1 4.418886 5.1975366 4.8940623 6.4533042 6.5896291
## 2 2.481757 3.4769373 1.8336080 1.8752303 2.0085699
## 3 0.771453 1.2323343 0.8132916 0.8263903 1.5091923
## 4 1.044907 0.6409937 0.6680345 0.6796981 0.7181847
## 5 8.020950 8.9977983 8.6980941 7.7988594 8.5639547
## 6 1.707117 2.2888940 1.3883711 1.4247656 0.9904202
## L3_whole_rep3 L3_cells_rep3 L3_GFPplus_rep3 L3_GFPminus_rep3
## 1 7.2608024 5.1476312 4.970793 5.4765110
## 2 3.7532881 2.3455746 2.134670 2.0336037
## 3 0.8057572 0.7727658 1.635552 0.7873616
## 4 1.1558275 0.6319660 1.470054 0.6449526
## 5 9.1092990 8.3707058 9.071061 7.7687544
## 6 2.6076354 1.7134489 1.022428 1.3149435
thresh = 1
sig = 0.01
embryo_rlog_status_df <- all_samples_rld_df %>%
select(WBGeneID, embryo_GFPplus_rep1, embryo_GFPplus_rep2, embryo_GFPplus_rep3) %>%
pivot_longer(cols = embryo_GFPplus_rep1:embryo_GFPplus_rep3, values_to = "rlog_counts") %>%
separate(name, sep = "_", into = c("stage", "sample", "rep")) %>%
group_by(WBGeneID) %>%
summarise(mean.rlog.counts = mean(rlog_counts), var.rlog.counts = var(rlog_counts)) %>%
left_join(embryo_alt_hyp_res_df %>% filter(label == "GFPplus_vs_GFPminus") %>% select(WBGeneID, type, isDE), by = "WBGeneID")
head(embryo_rlog_status_df)
## # A tibble: 6 × 5
## WBGeneID mean.rlog.counts var.rlog.counts type isDE
## <chr> <dbl> <dbl> <chr> <lgl>
## 1 WBGene00000001 10.1 0.00454 greater FALSE
## 2 WBGene00000001 10.1 0.00454 less FALSE
## 3 WBGene00000001 10.1 0.00454 lessAbs FALSE
## 4 WBGene00000002 10.9 0.131 greater FALSE
## 5 WBGene00000002 10.9 0.131 less FALSE
## 6 WBGene00000002 10.9 0.131 lessAbs FALSE
# write_csv(embryo_rlog_status_df, file = "../03_output/embryo_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
# embryo_rlog_status_df <- read_csv(file = "../03_output/embryo_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
Make function
thresh = 1
sig = 0.01
rlog_status <- function(stage, res, hyp_df){
all_samples_rld_df %>%
select(WBGeneID, contains(paste(stage,"GFPplus", sep = "_"))) %>%
pivot_longer(cols = contains("GFPplus"), values_to = "rlog_counts") %>%
separate(name, sep = "_", into = c("stage", "sample", "rep")) %>%
group_by(WBGeneID) %>%
summarise(mean.rlog.counts = mean(rlog_counts), var.rlog.counts = var(rlog_counts)) %>%
left_join(hyp_df %>% filter(label == "GFPplus_vs_GFPminus") %>% select(WBGeneID, type, isDE), by = "WBGeneID")
}
embryo_rlog_status_df <- rlog_status(stage = "embryo", res = res_embryoGFPplus_vs_embryoGFPminus, hyp_df = embryo_alt_hyp_res_df)
# head(embryo_rlog_status_df)
# write_csv(embryo_rlog_status_df, file = "../03_output/embryo_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
# embryo_rlog_status_df <- read_csv(file = "../03_output/embryo_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
L1_rlog_status_df <- rlog_status(stage = "L1", res = res_L1GFPplus_vs_L1_GFPminus, hyp_df = L1_alt_hyp_res_df)
# write_csv(L1_rlog_status_df, file = "../03_output/L1_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
# L1_rlog_status_df <- read_csv(file = "../03_output/L1_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
# head(L1_rlog_status_df)
L3_rlog_status_df <- rlog_status(stage = "L3", res = res_L3GFPplus_vs_L3_GFPminus, hyp_df = L3_alt_hyp_res_df)
# write_csv(L3_rlog_status_df, file = "../03_output/L3_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
# L3_rlog_status_df <- read_csv(file = "../03_output/L3_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
# head(L3_rlog_status_df)
intestine_enriched_genes <- data.frame(embryo_rlog_status_df, stage = "embryo") %>% bind_rows(data.frame(L1_rlog_status_df, stage = "L1"), data.frame(L3_rlog_status_df, stage = "L3")) %>%filter(type == "greater", isDE == TRUE) %>% select(WBGeneID, stage)
stage_list<- list(embryo = filter(intestine_enriched_genes, stage == "embryo")$WBGeneID,
L1 = filter(intestine_enriched_genes, stage == "L1")$WBGeneID,
L3 = filter(intestine_enriched_genes, stage == "L3")$WBGeneID)
comb_mat <-make_comb_mat(stage_list)
UpSet(comb_mat)
comb_size(comb_mat)
## 111 110 101 011 100 010 001
## 883 134 134 315 691 231 164
gfpplus_matrix <- all_samples_rld_df %>% select(WBGeneID, contains("GFPplus")) %>% filter(WBGeneID %in% intestine_enriched_genes$WBGeneID) %>% column_to_rownames(var = "WBGeneID") %>% as.matrix()
nrow(gfpplus_matrix)
## [1] 2552
Mean and SD thresholds
sd_thresh <- 1
mean_thresh <- 6
hist(rowMeans(gfpplus_matrix), main = "GFPplus rlog row means")
abline(v = mean_thresh, col = "red", lty = 2)
hist(rowSds(gfpplus_matrix), main = "GFPplus rlog row SD")
abline(v = sd_thresh, col = "red", lty = 2)
plot(rowMeans(gfpplus_matrix), rowSds(gfpplus_matrix), pch = 20, cex = 0.1)
abline(h = sd_thresh, v = mean_thresh, col = "red", lty = 2)
# abline(, col = "red", lty = 2)
gfpplus_matrix_row_zscore <- (gfpplus_matrix - rowMeans(gfpplus_matrix))/rowSds(gfpplus_matrix)
pheatmap(gfpplus_matrix_row_zscore,
cluster_cols = FALSE,
show_rownames = FALSE,
# cutree_rows = 5,
main = paste0("Dynamics of intestine enriched genes\n unthresholded\n # genes = ", nrow(gfpplus_matrix_row_zscore), sep = ""))
pheatmap(gfpplus_matrix_row_zscore[rowSds(gfpplus_matrix) >= sd_thresh & rowMeans(gfpplus_matrix) >= mean_thresh,],
cluster_cols = TRUE,
show_rownames = FALSE,
# cutree_rows = 5,
main = paste0("Dynamics of intestine enriched genes\n thresholds: row mean >= ", mean_thresh, ", row SD >= ", sd_thresh,
"\n # genes = ", nrow(gfpplus_matrix_row_zscore[rowSds(gfpplus_matrix) >= sd_thresh & rowMeans(gfpplus_matrix) >= mean_thresh,])))
# Heatmap of transcription factors
wtf3 <- read_table(file = "../../../David/01_promoters/01_input/wtf3.wbid", col_names = FALSE)$X1
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character()
## )
## Warning: 1 parsing failure.
## row col expected actual file
## 23 -- 1 columns 2 columns '../../../David/01_promoters/01_input/wtf3.wbid'
gfpplus_matrix_row_zscore_TFs <- gfpplus_matrix_row_zscore[rownames(gfpplus_matrix_row_zscore) %in% wtf3,]
pheatmap(gfpplus_matrix_row_zscore_TFs ,
cluster_cols = FALSE,
show_rownames = FALSE,
main = paste0("Dynamics of intestine enrcihed transcription factors\n unthresholded\n # genes =",
nrow(gfpplus_matrix_row_zscore_TFs ), sep = "")
)
gfpplus_matrix_TFs <- gfpplus_matrix[rownames(gfpplus_matrix) %in% wtf3,]
plot(rowMeans(gfpplus_matrix_TFs), rowSds(gfpplus_matrix_TFs))
abline(h = 1, col = "red", lty = 2)
pheatmap(gfpplus_matrix_row_zscore_TFs[rowSds(gfpplus_matrix_TFs) >= 1,] ,
cluster_cols = FALSE,
show_rownames = FALSE,
main = paste0("Dynamics of intestine enrcihed transcription factors\n theshold: row SD >= 1\n # genes =",
nrow(gfpplus_matrix_row_zscore_TFs[rowSds(gfpplus_matrix_TFs) >= 1,]), sep = "")
)